In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
In [2]:
data=pd.read_csv("dataset/cancer_reg.csv",encoding = "ISO-8859-1")
In [3]:
data.head()
Out[3]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap binnedInc MedianAge ... PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage PctPublicCoverageAlone PctWhite PctBlack PctAsian PctOtherRace PctMarriedHouseholds BirthRate
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 (61494.5, 125635] 39.3 ... NaN 41.6 32.9 14.0 81.780529 2.594728 4.821857 1.843479 52.856076 6.118831
1 173.0 70 161.3 411.6 48127 43269 18.6 23.111234 (48021.6, 51046.4] 33.0 ... 53.8 43.6 31.1 15.3 89.228509 0.969102 2.246233 3.741352 45.372500 4.333096
2 102.0 50 174.7 349.7 49348 21026 14.6 47.560164 (48021.6, 51046.4] 45.0 ... 43.5 34.9 42.1 21.1 90.922190 0.739673 0.465898 2.747358 54.444868 3.729488
3 427.0 202 194.8 430.4 44243 75882 17.1 342.637253 (42724.4, 45201] 42.8 ... 40.3 35.0 45.3 25.0 91.744686 0.782626 1.161359 1.362643 51.021514 4.603841
4 57.0 26 144.4 350.1 49955 10321 12.5 0.000000 (48021.6, 51046.4] 48.3 ... 43.9 35.1 44.0 22.7 94.104024 0.270192 0.665830 0.492135 54.027460 6.796657

5 rows × 34 columns

In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   binnedInc                3047 non-null   object 
 9   MedianAge                3047 non-null   float64
 10  MedianAgeMale            3047 non-null   float64
 11  MedianAgeFemale          3047 non-null   float64
 12  Geography                3047 non-null   object 
 13  AvgHouseholdSize         3047 non-null   float64
 14  PercentMarried           3047 non-null   float64
 15  PctNoHS18_24             3047 non-null   float64
 16  PctHS18_24               3047 non-null   float64
 17  PctSomeCol18_24          762 non-null    float64
 18  PctBachDeg18_24          3047 non-null   float64
 19  PctHS25_Over             3047 non-null   float64
 20  PctBachDeg25_Over        3047 non-null   float64
 21  PctEmployed16_Over       2895 non-null   float64
 22  PctUnemployed16_Over     3047 non-null   float64
 23  PctPrivateCoverage       3047 non-null   float64
 24  PctPrivateCoverageAlone  2438 non-null   float64
 25  PctEmpPrivCoverage       3047 non-null   float64
 26  PctPublicCoverage        3047 non-null   float64
 27  PctPublicCoverageAlone   3047 non-null   float64
 28  PctWhite                 3047 non-null   float64
 29  PctBlack                 3047 non-null   float64
 30  PctAsian                 3047 non-null   float64
 31  PctOtherRace             3047 non-null   float64
 32  PctMarriedHouseholds     3047 non-null   float64
 33  BirthRate                3047 non-null   float64
dtypes: float64(29), int64(3), object(2)
memory usage: 809.5+ KB
In [5]:
data.describe(include='all')
Out[5]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap binnedInc MedianAge ... PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage PctPublicCoverageAlone PctWhite PctBlack PctAsian PctOtherRace PctMarriedHouseholds BirthRate
count 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3.047000e+03 3047.000000 3047.000000 3047 3047.000000 ... 2438.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000
unique NaN NaN NaN NaN NaN NaN NaN NaN 10 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN NaN NaN NaN (45201, 48021.6] NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN NaN NaN NaN 306 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
mean 606.338544 185.965868 178.664063 448.268586 47063.281917 1.026374e+05 16.878175 155.399415 NaN 45.272333 ... 48.453774 41.196324 36.252642 19.240072 83.645286 9.107978 1.253965 1.983523 51.243872 5.640306
std 1416.356223 504.134286 27.751511 54.560733 12040.090836 3.290592e+05 6.409087 529.628366 NaN 45.304480 ... 10.083006 9.447687 7.841741 6.113041 16.380025 14.534538 2.610276 3.517710 6.572814 1.985816
min 6.000000 3.000000 59.700000 201.300000 22640.000000 8.270000e+02 3.200000 0.000000 NaN 22.300000 ... 15.700000 13.500000 11.200000 2.600000 10.199155 0.000000 0.000000 0.000000 22.992490 0.000000
25% 76.000000 28.000000 161.200000 420.300000 38882.500000 1.168400e+04 12.150000 0.000000 NaN 37.700000 ... 41.000000 34.500000 30.900000 14.850000 77.296180 0.620675 0.254199 0.295172 47.763063 4.521419
50% 171.000000 61.000000 178.100000 453.549422 45207.000000 2.664300e+04 15.900000 0.000000 NaN 41.000000 ... 48.700000 41.100000 36.300000 18.800000 90.059774 2.247576 0.549812 0.826185 51.669941 5.381478
75% 518.000000 149.000000 195.200000 480.850000 52492.000000 6.867100e+04 20.400000 83.650776 NaN 44.000000 ... 55.600000 47.700000 41.550000 23.100000 95.451693 10.509732 1.221037 2.177960 55.395132 6.493677
max 38150.000000 14010.000000 362.800000 1206.900000 125635.000000 1.017029e+07 47.400000 9762.308998 NaN 624.000000 ... 78.900000 70.700000 65.100000 46.600000 100.000000 85.947799 42.619425 41.930251 78.075397 21.326165

11 rows × 34 columns

handling missing data

In [6]:
missing=data.isnull()
missing.head()
Out[6]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap binnedInc MedianAge ... PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage PctPublicCoverageAlone PctWhite PctBlack PctAsian PctOtherRace PctMarriedHouseholds BirthRate
0 False False False False False False False False False False ... True False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False

5 rows × 34 columns

In [7]:
for column in missing.columns.values.tolist():
    print(column)
    print(missing[column].value_counts())
    print("")
avgAnnCount
False    3047
Name: avgAnnCount, dtype: int64

avgDeathsPerYear
False    3047
Name: avgDeathsPerYear, dtype: int64

TARGET_deathRate
False    3047
Name: TARGET_deathRate, dtype: int64

incidenceRate
False    3047
Name: incidenceRate, dtype: int64

medIncome
False    3047
Name: medIncome, dtype: int64

popEst2015
False    3047
Name: popEst2015, dtype: int64

povertyPercent
False    3047
Name: povertyPercent, dtype: int64

studyPerCap
False    3047
Name: studyPerCap, dtype: int64

binnedInc
False    3047
Name: binnedInc, dtype: int64

MedianAge
False    3047
Name: MedianAge, dtype: int64

MedianAgeMale
False    3047
Name: MedianAgeMale, dtype: int64

MedianAgeFemale
False    3047
Name: MedianAgeFemale, dtype: int64

Geography
False    3047
Name: Geography, dtype: int64

AvgHouseholdSize
False    3047
Name: AvgHouseholdSize, dtype: int64

PercentMarried
False    3047
Name: PercentMarried, dtype: int64

PctNoHS18_24
False    3047
Name: PctNoHS18_24, dtype: int64

PctHS18_24
False    3047
Name: PctHS18_24, dtype: int64

PctSomeCol18_24
True     2285
False     762
Name: PctSomeCol18_24, dtype: int64

PctBachDeg18_24
False    3047
Name: PctBachDeg18_24, dtype: int64

PctHS25_Over
False    3047
Name: PctHS25_Over, dtype: int64

PctBachDeg25_Over
False    3047
Name: PctBachDeg25_Over, dtype: int64

PctEmployed16_Over
False    2895
True      152
Name: PctEmployed16_Over, dtype: int64

PctUnemployed16_Over
False    3047
Name: PctUnemployed16_Over, dtype: int64

PctPrivateCoverage
False    3047
Name: PctPrivateCoverage, dtype: int64

PctPrivateCoverageAlone
False    2438
True      609
Name: PctPrivateCoverageAlone, dtype: int64

PctEmpPrivCoverage
False    3047
Name: PctEmpPrivCoverage, dtype: int64

PctPublicCoverage
False    3047
Name: PctPublicCoverage, dtype: int64

PctPublicCoverageAlone
False    3047
Name: PctPublicCoverageAlone, dtype: int64

PctWhite
False    3047
Name: PctWhite, dtype: int64

PctBlack
False    3047
Name: PctBlack, dtype: int64

PctAsian
False    3047
Name: PctAsian, dtype: int64

PctOtherRace
False    3047
Name: PctOtherRace, dtype: int64

PctMarriedHouseholds
False    3047
Name: PctMarriedHouseholds, dtype: int64

BirthRate
False    3047
Name: BirthRate, dtype: int64

Based on the summary above, each column has 3047 rows of data,

3 columns contain missing values

1.PctSomeCol18_24 : 2285
2.PctEmployed16_Over :  152 
3.PctPrivateCoverageAlone :  609

PctSomeCol18_24 has 74% missing value
PctEmployed16_Over has 4% missing value
PctPrivateCoverageAlone has 19.98% missing value

so we can drop the PctSomeCol18_24 column and replace the missing value of PctEmployed16_Over, PctPrivateCoverageAlone with there mean

Replaceing missing

In [8]:
avg_PctEmp=data['PctEmployed16_Over'].astype('float').mean(axis=0)
print("avg loss of PctEmployed16_Over",avg_PctEmp)
avg_PctPri=data['PctPrivateCoverageAlone'].astype('float').mean(axis=0)
print("avg loss of PctPrivateCoverageAlone",avg_PctPri)
avg loss of PctEmployed16_Over 54.15264248704645
avg loss of PctPrivateCoverageAlone 48.45377358490559
In [9]:
data['PctEmployed16_Over'].replace(np.nan,avg_PctEmp,inplace=True)
data['PctPrivateCoverageAlone'].replace(np.nan,avg_PctEmp,inplace=True)
In [10]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   binnedInc                3047 non-null   object 
 9   MedianAge                3047 non-null   float64
 10  MedianAgeMale            3047 non-null   float64
 11  MedianAgeFemale          3047 non-null   float64
 12  Geography                3047 non-null   object 
 13  AvgHouseholdSize         3047 non-null   float64
 14  PercentMarried           3047 non-null   float64
 15  PctNoHS18_24             3047 non-null   float64
 16  PctHS18_24               3047 non-null   float64
 17  PctSomeCol18_24          762 non-null    float64
 18  PctBachDeg18_24          3047 non-null   float64
 19  PctHS25_Over             3047 non-null   float64
 20  PctBachDeg25_Over        3047 non-null   float64
 21  PctEmployed16_Over       3047 non-null   float64
 22  PctUnemployed16_Over     3047 non-null   float64
 23  PctPrivateCoverage       3047 non-null   float64
 24  PctPrivateCoverageAlone  3047 non-null   float64
 25  PctEmpPrivCoverage       3047 non-null   float64
 26  PctPublicCoverage        3047 non-null   float64
 27  PctPublicCoverageAlone   3047 non-null   float64
 28  PctWhite                 3047 non-null   float64
 29  PctBlack                 3047 non-null   float64
 30  PctAsian                 3047 non-null   float64
 31  PctOtherRace             3047 non-null   float64
 32  PctMarriedHouseholds     3047 non-null   float64
 33  BirthRate                3047 non-null   float64
dtypes: float64(29), int64(3), object(2)
memory usage: 809.5+ KB

Droping columns

In [11]:
data.drop(['PctSomeCol18_24'],axis=1,inplace=True)
In [12]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   binnedInc                3047 non-null   object 
 9   MedianAge                3047 non-null   float64
 10  MedianAgeMale            3047 non-null   float64
 11  MedianAgeFemale          3047 non-null   float64
 12  Geography                3047 non-null   object 
 13  AvgHouseholdSize         3047 non-null   float64
 14  PercentMarried           3047 non-null   float64
 15  PctNoHS18_24             3047 non-null   float64
 16  PctHS18_24               3047 non-null   float64
 17  PctBachDeg18_24          3047 non-null   float64
 18  PctHS25_Over             3047 non-null   float64
 19  PctBachDeg25_Over        3047 non-null   float64
 20  PctEmployed16_Over       3047 non-null   float64
 21  PctUnemployed16_Over     3047 non-null   float64
 22  PctPrivateCoverage       3047 non-null   float64
 23  PctPrivateCoverageAlone  3047 non-null   float64
 24  PctEmpPrivCoverage       3047 non-null   float64
 25  PctPublicCoverage        3047 non-null   float64
 26  PctPublicCoverageAlone   3047 non-null   float64
 27  PctWhite                 3047 non-null   float64
 28  PctBlack                 3047 non-null   float64
 29  PctAsian                 3047 non-null   float64
 30  PctOtherRace             3047 non-null   float64
 31  PctMarriedHouseholds     3047 non-null   float64
 32  BirthRate                3047 non-null   float64
dtypes: float64(28), int64(3), object(2)
memory usage: 785.7+ KB

Visualising the data

In [13]:
sns.pairplot(data)
plt.show()
In [13]:
x=data
In [14]:
x.describe()
Out[14]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap MedianAge MedianAgeMale ... PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage PctPublicCoverageAlone PctWhite PctBlack PctAsian PctOtherRace PctMarriedHouseholds BirthRate
count 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3.047000e+03 3047.000000 3047.000000 3047.000000 3047.000000 ... 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000 3047.000000
mean 606.338544 185.965868 178.664063 448.268586 47063.281917 1.026374e+05 16.878175 155.399415 45.272333 39.570725 ... 49.592799 41.196324 36.252642 19.240072 83.645286 9.107978 1.253965 1.983523 51.243872 5.640306
std 1416.356223 504.134286 27.751511 54.560733 12040.090836 3.290592e+05 6.409087 529.628366 45.304480 5.226017 ... 9.302461 9.447687 7.841741 6.113041 16.380025 14.534538 2.610276 3.517710 6.572814 1.985816
min 6.000000 3.000000 59.700000 201.300000 22640.000000 8.270000e+02 3.200000 0.000000 22.300000 22.400000 ... 15.700000 13.500000 11.200000 2.600000 10.199155 0.000000 0.000000 0.000000 22.992490 0.000000
25% 76.000000 28.000000 161.200000 420.300000 38882.500000 1.168400e+04 12.150000 0.000000 37.700000 36.350000 ... 43.100000 34.500000 30.900000 14.850000 77.296180 0.620675 0.254199 0.295172 47.763063 4.521419
50% 171.000000 61.000000 178.100000 453.549422 45207.000000 2.664300e+04 15.900000 0.000000 41.000000 39.600000 ... 51.900000 41.100000 36.300000 18.800000 90.059774 2.247576 0.549812 0.826185 51.669941 5.381478
75% 518.000000 149.000000 195.200000 480.850000 52492.000000 6.867100e+04 20.400000 83.650776 44.000000 42.500000 ... 54.152642 47.700000 41.550000 23.100000 95.451693 10.509732 1.221037 2.177960 55.395132 6.493677
max 38150.000000 14010.000000 362.800000 1206.900000 125635.000000 1.017029e+07 47.400000 9762.308998 624.000000 64.700000 ... 78.900000 70.700000 65.100000 46.600000 100.000000 85.947799 42.619425 41.930251 78.075397 21.326165

8 rows × 31 columns

In [16]:
plt.figure(figsize=(20,10))
sns.pairplot(x_vars=['avgAnnCount','avgDeathsPerYear','incidenceRate','medIncome','popEst2015','povertyPercent','studyPerCap','binnedInc','MedianAge','MedianAgeMale','MedianAgeFemale','Geography','AvgHouseholdSize','PercentMarried','PctNoHS18_24','PctHS18_24','PctBachDeg18_24','PctHS25_Over','PctBachDeg25_Over','PctEmployed16_Over','PctUnemployed16_Over','PctPrivateCoverage','PctPrivateCoverageAlone','PctEmpPrivCoverage','PctPublicCoverage','PctPublicCoverageAlone','PctWhite','PctBlack','PctAsian','PctOtherRace','PctMarriedHouseholds','BirthRate'],y_vars=['TARGET_deathRate'],data=data,palette='hus1')
plt.show()
<Figure size 1440x720 with 0 Axes>

Visualising categorical value

In [17]:
plt.figure(figsize=(20,10))
sns.boxplot(x='Geography',y='TARGET_deathRate',data=data,palette='Blues')
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x2229e9f2ac8>
In [15]:
data['Geography'].head(10)
Out[15]:
0          Kitsap County, Washington
1        Kittitas County, Washington
2       Klickitat County, Washington
3           Lewis County, Washington
4         Lincoln County, Washington
5           Mason County, Washington
6        Okanogan County, Washington
7         Pacific County, Washington
8    Pend Oreille County, Washington
9          Pierce County, Washington
Name: Geography, dtype: object

from here we can see all Geography is different but they has the state name as common so we can change the geogrphy to the state name

In [16]:
data[['first','Geography']] = data.Geography.str.split(", ",expand=True)
In [17]:
data.drop(['first'],axis=1,inplace=True)
In [18]:
data['Geography'].head()
Out[18]:
0    Washington
1    Washington
2    Washington
3    Washington
4    Washington
Name: Geography, dtype: object
In [22]:
plt.figure(figsize=(20,10))
sns.boxplot(x='Geography',y='TARGET_deathRate',data=data,palette='Blues')
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x222f485cb00>
In [19]:
data.Geography.value_counts()
Out[19]:
Texas                   233
Georgia                 155
Virginia                125
Kentucky                120
Missouri                115
Illinois                102
Kansas                  102
North Carolina           99
Iowa                     99
Tennessee                95
Indiana                  92
Minnesota                87
Ohio                     87
Michigan                 83
Mississippi              82
Nebraska                 80
Oklahoma                 77
Arkansas                 75
Wisconsin                72
Pennsylvania             67
Florida                  66
Louisiana                64
Alabama                  63
New York                 62
Colorado                 60
South Dakota             59
California               57
West Virginia            55
North Dakota             51
Montana                  48
South Carolina           46
Idaho                    42
Washington               39
Oregon                   36
New Mexico               32
Utah                     27
Maryland                 24
Wyoming                  23
New Jersey               21
Alaska                   18
Nevada                   17
Maine                    16
Arizona                  15
Vermont                  14
Massachusetts            14
New Hampshire            10
Connecticut               8
Rhode Island              5
Hawaii                    4
Delaware                  3
District of Columbia      1
Name: Geography, dtype: int64
In [24]:
plt.figure(figsize=(20,10))
sns.boxplot(x='binnedInc',y='TARGET_deathRate',data=data,palette='Blues')
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x222890e9d68>

Dummy variable

In [20]:
binned=pd.get_dummies(data['binnedInc'],drop_first=True)
binned.head()
binned.shape
Out[20]:
(3047, 9)
In [21]:
geo=pd.get_dummies(data['Geography'],drop_first=True)
geo.shape
Out[21]:
(3047, 50)
In [22]:
data=pd.concat([data,binned],axis=1)
data.head()
Out[22]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap binnedInc MedianAge ... BirthRate (37413.8, 40362.7] (40362.7, 42724.4] (42724.4, 45201] (45201, 48021.6] (48021.6, 51046.4] (51046.4, 54545.6] (54545.6, 61494.5] (61494.5, 125635] [22640, 34218.1]
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 (61494.5, 125635] 39.3 ... 6.118831 0 0 0 0 0 0 0 1 0
1 173.0 70 161.3 411.6 48127 43269 18.6 23.111234 (48021.6, 51046.4] 33.0 ... 4.333096 0 0 0 0 1 0 0 0 0
2 102.0 50 174.7 349.7 49348 21026 14.6 47.560164 (48021.6, 51046.4] 45.0 ... 3.729488 0 0 0 0 1 0 0 0 0
3 427.0 202 194.8 430.4 44243 75882 17.1 342.637253 (42724.4, 45201] 42.8 ... 4.603841 0 0 1 0 0 0 0 0 0
4 57.0 26 144.4 350.1 49955 10321 12.5 0.000000 (48021.6, 51046.4] 48.3 ... 6.796657 0 0 0 0 1 0 0 0 0

5 rows × 42 columns

In [23]:
data.drop(['binnedInc'],axis=1,inplace=True)
data.head()
Out[23]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap MedianAge MedianAgeMale ... BirthRate (37413.8, 40362.7] (40362.7, 42724.4] (42724.4, 45201] (45201, 48021.6] (48021.6, 51046.4] (51046.4, 54545.6] (54545.6, 61494.5] (61494.5, 125635] [22640, 34218.1]
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 39.3 36.9 ... 6.118831 0 0 0 0 0 0 0 1 0
1 173.0 70 161.3 411.6 48127 43269 18.6 23.111234 33.0 32.2 ... 4.333096 0 0 0 0 1 0 0 0 0
2 102.0 50 174.7 349.7 49348 21026 14.6 47.560164 45.0 44.0 ... 3.729488 0 0 0 0 1 0 0 0 0
3 427.0 202 194.8 430.4 44243 75882 17.1 342.637253 42.8 42.2 ... 4.603841 0 0 1 0 0 0 0 0 0
4 57.0 26 144.4 350.1 49955 10321 12.5 0.000000 48.3 47.8 ... 6.796657 0 0 0 0 1 0 0 0 0

5 rows × 41 columns

In [24]:
data=pd.concat([data,geo],axis=1)
data.head()
Out[24]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap MedianAge MedianAgeMale ... South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia Wisconsin Wyoming
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 39.3 36.9 ... 0 0 0 0 0 0 1 0 0 0
1 173.0 70 161.3 411.6 48127 43269 18.6 23.111234 33.0 32.2 ... 0 0 0 0 0 0 1 0 0 0
2 102.0 50 174.7 349.7 49348 21026 14.6 47.560164 45.0 44.0 ... 0 0 0 0 0 0 1 0 0 0
3 427.0 202 194.8 430.4 44243 75882 17.1 342.637253 42.8 42.2 ... 0 0 0 0 0 0 1 0 0 0
4 57.0 26 144.4 350.1 49955 10321 12.5 0.000000 48.3 47.8 ... 0 0 0 0 0 0 1 0 0 0

5 rows × 91 columns

In [25]:
data.drop(['Geography'],axis=1,inplace=True)
data.head()
Out[25]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap MedianAge MedianAgeMale ... South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia Wisconsin Wyoming
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 39.3 36.9 ... 0 0 0 0 0 0 1 0 0 0
1 173.0 70 161.3 411.6 48127 43269 18.6 23.111234 33.0 32.2 ... 0 0 0 0 0 0 1 0 0 0
2 102.0 50 174.7 349.7 49348 21026 14.6 47.560164 45.0 44.0 ... 0 0 0 0 0 0 1 0 0 0
3 427.0 202 194.8 430.4 44243 75882 17.1 342.637253 42.8 42.2 ... 0 0 0 0 0 0 1 0 0 0
4 57.0 26 144.4 350.1 49955 10321 12.5 0.000000 48.3 47.8 ... 0 0 0 0 0 0 1 0 0 0

5 rows × 90 columns

In [26]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 90 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avgAnnCount              3047 non-null   float64
 1   avgDeathsPerYear         3047 non-null   int64  
 2   TARGET_deathRate         3047 non-null   float64
 3   incidenceRate            3047 non-null   float64
 4   medIncome                3047 non-null   int64  
 5   popEst2015               3047 non-null   int64  
 6   povertyPercent           3047 non-null   float64
 7   studyPerCap              3047 non-null   float64
 8   MedianAge                3047 non-null   float64
 9   MedianAgeMale            3047 non-null   float64
 10  MedianAgeFemale          3047 non-null   float64
 11  AvgHouseholdSize         3047 non-null   float64
 12  PercentMarried           3047 non-null   float64
 13  PctNoHS18_24             3047 non-null   float64
 14  PctHS18_24               3047 non-null   float64
 15  PctBachDeg18_24          3047 non-null   float64
 16  PctHS25_Over             3047 non-null   float64
 17  PctBachDeg25_Over        3047 non-null   float64
 18  PctEmployed16_Over       3047 non-null   float64
 19  PctUnemployed16_Over     3047 non-null   float64
 20  PctPrivateCoverage       3047 non-null   float64
 21  PctPrivateCoverageAlone  3047 non-null   float64
 22  PctEmpPrivCoverage       3047 non-null   float64
 23  PctPublicCoverage        3047 non-null   float64
 24  PctPublicCoverageAlone   3047 non-null   float64
 25  PctWhite                 3047 non-null   float64
 26  PctBlack                 3047 non-null   float64
 27  PctAsian                 3047 non-null   float64
 28  PctOtherRace             3047 non-null   float64
 29  PctMarriedHouseholds     3047 non-null   float64
 30  BirthRate                3047 non-null   float64
 31  (37413.8, 40362.7]       3047 non-null   uint8  
 32  (40362.7, 42724.4]       3047 non-null   uint8  
 33  (42724.4, 45201]         3047 non-null   uint8  
 34  (45201, 48021.6]         3047 non-null   uint8  
 35  (48021.6, 51046.4]       3047 non-null   uint8  
 36  (51046.4, 54545.6]       3047 non-null   uint8  
 37  (54545.6, 61494.5]       3047 non-null   uint8  
 38  (61494.5, 125635]        3047 non-null   uint8  
 39  [22640, 34218.1]         3047 non-null   uint8  
 40  Alaska                   3047 non-null   uint8  
 41  Arizona                  3047 non-null   uint8  
 42  Arkansas                 3047 non-null   uint8  
 43  California               3047 non-null   uint8  
 44  Colorado                 3047 non-null   uint8  
 45  Connecticut              3047 non-null   uint8  
 46  Delaware                 3047 non-null   uint8  
 47  District of Columbia     3047 non-null   uint8  
 48  Florida                  3047 non-null   uint8  
 49  Georgia                  3047 non-null   uint8  
 50  Hawaii                   3047 non-null   uint8  
 51  Idaho                    3047 non-null   uint8  
 52  Illinois                 3047 non-null   uint8  
 53  Indiana                  3047 non-null   uint8  
 54  Iowa                     3047 non-null   uint8  
 55  Kansas                   3047 non-null   uint8  
 56  Kentucky                 3047 non-null   uint8  
 57  Louisiana                3047 non-null   uint8  
 58  Maine                    3047 non-null   uint8  
 59  Maryland                 3047 non-null   uint8  
 60  Massachusetts            3047 non-null   uint8  
 61  Michigan                 3047 non-null   uint8  
 62  Minnesota                3047 non-null   uint8  
 63  Mississippi              3047 non-null   uint8  
 64  Missouri                 3047 non-null   uint8  
 65  Montana                  3047 non-null   uint8  
 66  Nebraska                 3047 non-null   uint8  
 67  Nevada                   3047 non-null   uint8  
 68  New Hampshire            3047 non-null   uint8  
 69  New Jersey               3047 non-null   uint8  
 70  New Mexico               3047 non-null   uint8  
 71  New York                 3047 non-null   uint8  
 72  North Carolina           3047 non-null   uint8  
 73  North Dakota             3047 non-null   uint8  
 74  Ohio                     3047 non-null   uint8  
 75  Oklahoma                 3047 non-null   uint8  
 76  Oregon                   3047 non-null   uint8  
 77  Pennsylvania             3047 non-null   uint8  
 78  Rhode Island             3047 non-null   uint8  
 79  South Carolina           3047 non-null   uint8  
 80  South Dakota             3047 non-null   uint8  
 81  Tennessee                3047 non-null   uint8  
 82  Texas                    3047 non-null   uint8  
 83  Utah                     3047 non-null   uint8  
 84  Vermont                  3047 non-null   uint8  
 85  Virginia                 3047 non-null   uint8  
 86  Washington               3047 non-null   uint8  
 87  West Virginia            3047 non-null   uint8  
 88  Wisconsin                3047 non-null   uint8  
 89  Wyoming                  3047 non-null   uint8  
dtypes: float64(28), int64(3), uint8(59)
memory usage: 913.6 KB

Spliting data into train and test

In [27]:
from sklearn.model_selection import train_test_split
df_train,df_test=train_test_split(data,train_size=0.75,test_size=0.25,random_state=100)
In [28]:
df_train.shape
Out[28]:
(2285, 90)
In [29]:
df_test.shape
Out[29]:
(762, 90)

Rescaling

In [30]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
In [31]:
convars=['avgAnnCount','avgDeathsPerYear','TARGET_deathRate','incidenceRate','medIncome','popEst2015','povertyPercent','studyPerCap','MedianAge','MedianAgeMale','MedianAgeFemale','AvgHouseholdSize','PercentMarried','PctNoHS18_24','PctHS18_24','PctBachDeg18_24','PctHS25_Over','PctBachDeg25_Over','PctEmployed16_Over','PctUnemployed16_Over','PctPrivateCoverage','PctPrivateCoverageAlone','PctEmpPrivCoverage','PctPublicCoverage','PctPublicCoverageAlone','PctWhite','PctBlack','PctAsian','PctOtherRace','PctMarriedHouseholds','BirthRate']
df_train[convars]=scaler.fit_transform(df_train[convars])
df_train.head()
C:\Users\kanan\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
C:\Users\kanan\AppData\Roaming\Python\Python36\site-packages\pandas\core\indexing.py:966: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
Out[31]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap MedianAge MedianAgeMale ... South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia Wisconsin Wyoming
528 0.002228 0.003213 0.414244 0.193914 0.177638 0.001612 0.315789 0.000000 0.030081 0.397163 ... 0 0 0 0 0 0 0 0 0 0
16 0.036258 0.034269 0.299055 0.292959 0.390126 0.026377 0.187643 0.021282 0.026924 0.347518 ... 0 0 0 0 0 0 1 0 0 0
2664 0.002281 0.002642 0.338299 0.224344 0.124379 0.001736 0.526316 0.000000 0.027755 0.347518 ... 0 0 0 0 0 0 0 0 0 0
2907 0.004195 0.004569 0.379724 0.297136 0.156888 0.002285 0.395881 0.000000 0.033405 0.430260 ... 0 0 0 0 0 0 0 0 0 0
1793 0.004037 0.004284 0.260538 0.209626 0.245868 0.003856 0.281465 0.000000 0.022104 0.264775 ... 0 0 1 0 0 0 0 0 0 0

5 rows × 90 columns

divide it in x and y sets for model build

In [32]:
y_train=df_train.pop("TARGET_deathRate")
X_train=df_train

heatmap

In [38]:
plt.figure(figsize = (30, 10))
sns.heatmap(df_train.corr(), cmap="YlGnBu")
plt.show()

model building

using RFE

In [33]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
In [34]:
lm=LinearRegression()
lm.fit(X_train,y_train)
Out[34]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [35]:
rfe=RFE(lm,70)
rfe=rfe.fit(X_train,y_train)
In [36]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))
Out[36]:
[('avgAnnCount', True, 1),
 ('avgDeathsPerYear', True, 1),
 ('incidenceRate', True, 1),
 ('medIncome', True, 1),
 ('popEst2015', True, 1),
 ('povertyPercent', False, 8),
 ('studyPerCap', True, 1),
 ('MedianAge', False, 7),
 ('MedianAgeMale', False, 9),
 ('MedianAgeFemale', True, 1),
 ('AvgHouseholdSize', True, 1),
 ('PercentMarried', True, 1),
 ('PctNoHS18_24', True, 1),
 ('PctHS18_24', True, 1),
 ('PctBachDeg18_24', True, 1),
 ('PctHS25_Over', True, 1),
 ('PctBachDeg25_Over', True, 1),
 ('PctEmployed16_Over', True, 1),
 ('PctUnemployed16_Over', True, 1),
 ('PctPrivateCoverage', True, 1),
 ('PctPrivateCoverageAlone', True, 1),
 ('PctEmpPrivCoverage', False, 16),
 ('PctPublicCoverage', True, 1),
 ('PctPublicCoverageAlone', True, 1),
 ('PctWhite', True, 1),
 ('PctBlack', True, 1),
 ('PctAsian', False, 4),
 ('PctOtherRace', True, 1),
 ('PctMarriedHouseholds', True, 1),
 ('BirthRate', True, 1),
 ('(37413.8, 40362.7]', False, 10),
 ('(40362.7, 42724.4]', False, 2),
 ('(42724.4, 45201]', False, 6),
 ('(45201, 48021.6]', True, 1),
 ('(48021.6, 51046.4]', True, 1),
 ('(51046.4, 54545.6]', True, 1),
 ('(54545.6, 61494.5]', False, 3),
 ('(61494.5, 125635]', False, 5),
 ('[22640, 34218.1]', True, 1),
 ('Alaska', True, 1),
 ('Arizona', True, 1),
 ('Arkansas', True, 1),
 ('California', True, 1),
 ('Colorado', True, 1),
 ('Connecticut', True, 1),
 ('Delaware', True, 1),
 ('District of Columbia', False, 19),
 ('Florida', False, 17),
 ('Georgia', True, 1),
 ('Hawaii', True, 1),
 ('Idaho', True, 1),
 ('Illinois', True, 1),
 ('Indiana', True, 1),
 ('Iowa', True, 1),
 ('Kansas', True, 1),
 ('Kentucky', True, 1),
 ('Louisiana', False, 18),
 ('Maine', False, 15),
 ('Maryland', False, 11),
 ('Massachusetts', True, 1),
 ('Michigan', True, 1),
 ('Minnesota', False, 20),
 ('Mississippi', True, 1),
 ('Missouri', True, 1),
 ('Montana', True, 1),
 ('Nebraska', True, 1),
 ('Nevada', True, 1),
 ('New Hampshire', True, 1),
 ('New Jersey', True, 1),
 ('New Mexico', True, 1),
 ('New York', True, 1),
 ('North Carolina', True, 1),
 ('North Dakota', True, 1),
 ('Ohio', False, 13),
 ('Oklahoma', True, 1),
 ('Oregon', True, 1),
 ('Pennsylvania', True, 1),
 ('Rhode Island', True, 1),
 ('South Carolina', False, 14),
 ('South Dakota', True, 1),
 ('Tennessee', True, 1),
 ('Texas', True, 1),
 ('Utah', True, 1),
 ('Vermont', True, 1),
 ('Virginia', True, 1),
 ('Washington', True, 1),
 ('West Virginia', False, 12),
 ('Wisconsin', True, 1),
 ('Wyoming', True, 1)]
In [37]:
col=X_train.columns[rfe.support_]
In [38]:
col
Out[38]:
Index(['avgAnnCount', 'avgDeathsPerYear', 'incidenceRate', 'medIncome',
       'popEst2015', 'studyPerCap', 'MedianAgeFemale', 'AvgHouseholdSize',
       'PercentMarried', 'PctNoHS18_24', 'PctHS18_24', 'PctBachDeg18_24',
       'PctHS25_Over', 'PctBachDeg25_Over', 'PctEmployed16_Over',
       'PctUnemployed16_Over', 'PctPrivateCoverage', 'PctPrivateCoverageAlone',
       'PctPublicCoverage', 'PctPublicCoverageAlone', 'PctWhite', 'PctBlack',
       'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate', '(45201, 48021.6]',
       '(48021.6, 51046.4]', '(51046.4, 54545.6]', '[22640, 34218.1]',
       'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
       'Connecticut', 'Delaware', 'Georgia', 'Hawaii', 'Idaho', 'Illinois',
       'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Massachusetts', 'Michigan',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virginia', 'Washington', 'Wisconsin', 'Wyoming'],
      dtype='object')

mmodel building using statsmodel

In [39]:
X_train_rfe=X_train[col]
In [40]:
import statsmodels.api as sm
X_train_rfe=sm.add_constant(X_train_rfe)
In [41]:
lm=sm.OLS(y_train,X_train_rfe).fit()
In [42]:
print(lm.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.591
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     45.62
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:00   Log-Likelihood:                 3047.8
No. Observations:                2285   AIC:                            -5954.
Df Residuals:                    2214   BIC:                            -5546.
Df Model:                          70                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       0.2544      0.045      5.671      0.000       0.166       0.342
avgAnnCount                -1.1608      0.167     -6.965      0.000      -1.488      -0.834
avgDeathsPerYear            1.5295      0.236      6.476      0.000       1.066       1.993
incidenceRate               0.6753      0.030     22.263      0.000       0.616       0.735
medIncome                   0.0362      0.030      1.220      0.222      -0.022       0.094
popEst2015                 -0.4678      0.225     -2.083      0.037      -0.908      -0.027
studyPerCap                 0.0204      0.026      0.789      0.430      -0.030       0.071
MedianAgeFemale            -0.0562      0.026     -2.139      0.033      -0.108      -0.005
AvgHouseholdSize            0.0217      0.016      1.371      0.170      -0.009       0.053
PercentMarried              0.1030      0.032      3.190      0.001       0.040       0.166
PctNoHS18_24                0.0077      0.015      0.520      0.603      -0.021       0.037
PctHS18_24                  0.0369      0.013      2.849      0.004       0.011       0.062
PctBachDeg18_24            -0.0441      0.023     -1.949      0.051      -0.088       0.000
PctHS25_Over                0.0443      0.020      2.215      0.027       0.005       0.084
PctBachDeg25_Over          -0.1388      0.025     -5.521      0.000      -0.188      -0.089
PctEmployed16_Over         -0.0451      0.025     -1.821      0.069      -0.094       0.003
PctUnemployed16_Over        0.0553      0.020      2.699      0.007       0.015       0.096
PctPrivateCoverage          0.0158      0.035      0.458      0.647      -0.052       0.084
PctPrivateCoverageAlone     0.0267      0.018      1.460      0.144      -0.009       0.062
PctPublicCoverage          -0.0211      0.048     -0.441      0.659      -0.115       0.073
PctPublicCoverageAlone      0.0932      0.042      2.214      0.027       0.011       0.176
PctWhite                   -0.0613      0.023     -2.646      0.008      -0.107      -0.016
PctBlack                   -0.0791      0.023     -3.405      0.001      -0.125      -0.034
PctOtherRace               -0.0942      0.022     -4.346      0.000      -0.137      -0.052
PctMarriedHouseholds       -0.1863      0.035     -5.337      0.000      -0.255      -0.118
BirthRate                  -0.0492      0.016     -3.054      0.002      -0.081      -0.018
(45201, 48021.6]           -0.0058      0.005     -1.220      0.223      -0.015       0.004
(48021.6, 51046.4]         -0.0094      0.005     -1.921      0.055      -0.019       0.000
(51046.4, 54545.6]         -0.0083      0.005     -1.701      0.089      -0.018       0.001
[22640, 34218.1]            0.0232      0.006      3.998      0.000       0.012       0.035
Alaska                      0.0484      0.022      2.248      0.025       0.006       0.091
Arizona                    -0.0659      0.021     -3.156      0.002      -0.107      -0.025
Arkansas                    0.0377      0.010      3.790      0.000       0.018       0.057
California                 -0.0552      0.012     -4.470      0.000      -0.079      -0.031
Colorado                   -0.0661      0.012     -5.638      0.000      -0.089      -0.043
Connecticut                -0.0655      0.030     -2.207      0.027      -0.124      -0.007
Delaware                   -0.0403      0.046     -0.869      0.385      -0.131       0.051
Georgia                    -0.0226      0.008     -2.987      0.003      -0.037      -0.008
Hawaii                     -0.1353      0.041     -3.288      0.001      -0.216      -0.055
Idaho                      -0.0588      0.013     -4.451      0.000      -0.085      -0.033
Illinois                   -0.0155      0.009     -1.778      0.076      -0.033       0.002
Indiana                     0.0162      0.009      1.748      0.081      -0.002       0.034
Iowa                       -0.0397      0.009     -4.384      0.000      -0.057      -0.022
Kansas                      0.0328      0.011      3.028      0.002       0.012       0.054
Kentucky                    0.0307      0.009      3.611      0.000       0.014       0.047
Massachusetts              -0.0519      0.021     -2.473      0.013      -0.093      -0.011
Michigan                   -0.0135      0.010     -1.397      0.163      -0.032       0.005
Mississippi                 0.0249      0.010      2.512      0.012       0.005       0.044
Missouri                    0.0142      0.008      1.684      0.092      -0.002       0.031
Montana                    -0.0644      0.013     -4.916      0.000      -0.090      -0.039
Nebraska                   -0.0295      0.010     -2.829      0.005      -0.050      -0.009
Nevada                     -0.0238      0.022     -1.104      0.270      -0.066       0.019
New Hampshire              -0.0195      0.024     -0.831      0.406      -0.066       0.027
New Jersey                 -0.0268      0.017     -1.536      0.125      -0.061       0.007
New Mexico                 -0.0715      0.015     -4.639      0.000      -0.102      -0.041
New York                   -0.0556      0.011     -5.123      0.000      -0.077      -0.034
North Carolina             -0.0239      0.009     -2.674      0.008      -0.041      -0.006
North Dakota               -0.0193      0.013     -1.542      0.123      -0.044       0.005
Oklahoma                    0.0329      0.010      3.174      0.002       0.013       0.053
Oregon                     -0.0419      0.013     -3.167      0.002      -0.068      -0.016
Pennsylvania               -0.0450      0.011     -4.234      0.000      -0.066      -0.024
Rhode Island               -0.0398      0.033     -1.208      0.227      -0.105       0.025
South Dakota               -0.0337      0.012     -2.873      0.004      -0.057      -0.011
Tennessee                   0.0229      0.009      2.611      0.009       0.006       0.040
Texas                       0.0078      0.008      1.032      0.302      -0.007       0.023
Utah                       -0.1007      0.018     -5.720      0.000      -0.135      -0.066
Vermont                    -0.0185      0.021     -0.886      0.376      -0.060       0.022
Virginia                    0.0175      0.008      2.138      0.033       0.001       0.034
Washington                 -0.0336      0.013     -2.503      0.012      -0.060      -0.007
Wisconsin                  -0.0179      0.010     -1.823      0.068      -0.037       0.001
Wyoming                    -0.0183      0.016     -1.113      0.266      -0.051       0.014
==============================================================================
Omnibus:                      200.242   Durbin-Watson:                   2.006
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1227.867
Skew:                           0.109   Prob(JB):                    2.36e-267
Kurtosis:                       6.585   Cond. No.                         521.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

PctNoHS18_24 0.603
PctPrivateCoverage 0.647
PctPublicCoverage 0.659 -> is insignificant in presence of other variables

In [43]:
X_train_2=X_train_rfe.drop(['PctPublicCoverage','PctNoHS18_24','PctPrivateCoverage'],axis=1)
X_train_2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2285 entries, 528 to 1544
Data columns (total 68 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   const                    2285 non-null   float64
 1   avgAnnCount              2285 non-null   float64
 2   avgDeathsPerYear         2285 non-null   float64
 3   incidenceRate            2285 non-null   float64
 4   medIncome                2285 non-null   float64
 5   popEst2015               2285 non-null   float64
 6   studyPerCap              2285 non-null   float64
 7   MedianAgeFemale          2285 non-null   float64
 8   AvgHouseholdSize         2285 non-null   float64
 9   PercentMarried           2285 non-null   float64
 10  PctHS18_24               2285 non-null   float64
 11  PctBachDeg18_24          2285 non-null   float64
 12  PctHS25_Over             2285 non-null   float64
 13  PctBachDeg25_Over        2285 non-null   float64
 14  PctEmployed16_Over       2285 non-null   float64
 15  PctUnemployed16_Over     2285 non-null   float64
 16  PctPrivateCoverageAlone  2285 non-null   float64
 17  PctPublicCoverageAlone   2285 non-null   float64
 18  PctWhite                 2285 non-null   float64
 19  PctBlack                 2285 non-null   float64
 20  PctOtherRace             2285 non-null   float64
 21  PctMarriedHouseholds     2285 non-null   float64
 22  BirthRate                2285 non-null   float64
 23  (45201, 48021.6]         2285 non-null   uint8  
 24  (48021.6, 51046.4]       2285 non-null   uint8  
 25  (51046.4, 54545.6]       2285 non-null   uint8  
 26  [22640, 34218.1]         2285 non-null   uint8  
 27  Alaska                   2285 non-null   uint8  
 28  Arizona                  2285 non-null   uint8  
 29  Arkansas                 2285 non-null   uint8  
 30  California               2285 non-null   uint8  
 31  Colorado                 2285 non-null   uint8  
 32  Connecticut              2285 non-null   uint8  
 33  Delaware                 2285 non-null   uint8  
 34  Georgia                  2285 non-null   uint8  
 35  Hawaii                   2285 non-null   uint8  
 36  Idaho                    2285 non-null   uint8  
 37  Illinois                 2285 non-null   uint8  
 38  Indiana                  2285 non-null   uint8  
 39  Iowa                     2285 non-null   uint8  
 40  Kansas                   2285 non-null   uint8  
 41  Kentucky                 2285 non-null   uint8  
 42  Massachusetts            2285 non-null   uint8  
 43  Michigan                 2285 non-null   uint8  
 44  Mississippi              2285 non-null   uint8  
 45  Missouri                 2285 non-null   uint8  
 46  Montana                  2285 non-null   uint8  
 47  Nebraska                 2285 non-null   uint8  
 48  Nevada                   2285 non-null   uint8  
 49  New Hampshire            2285 non-null   uint8  
 50  New Jersey               2285 non-null   uint8  
 51  New Mexico               2285 non-null   uint8  
 52  New York                 2285 non-null   uint8  
 53  North Carolina           2285 non-null   uint8  
 54  North Dakota             2285 non-null   uint8  
 55  Oklahoma                 2285 non-null   uint8  
 56  Oregon                   2285 non-null   uint8  
 57  Pennsylvania             2285 non-null   uint8  
 58  Rhode Island             2285 non-null   uint8  
 59  South Dakota             2285 non-null   uint8  
 60  Tennessee                2285 non-null   uint8  
 61  Texas                    2285 non-null   uint8  
 62  Utah                     2285 non-null   uint8  
 63  Vermont                  2285 non-null   uint8  
 64  Virginia                 2285 non-null   uint8  
 65  Washington               2285 non-null   uint8  
 66  Wisconsin                2285 non-null   uint8  
 67  Wyoming                  2285 non-null   uint8  
dtypes: float64(23), uint8(45)
memory usage: 528.9 KB

rebuilding

In [44]:
X_train_2=sm.add_constant(X_train_2)
In [45]:
lm2=sm.OLS(y_train,X_train_2).fit()
In [46]:
print(lm2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.590
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     47.71
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:00   Log-Likelihood:                 3047.5
No. Observations:                2285   AIC:                            -5959.
Df Residuals:                    2217   BIC:                            -5569.
Df Model:                          67                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       0.2602      0.039      6.625      0.000       0.183       0.337
avgAnnCount                -1.1597      0.163     -7.096      0.000      -1.480      -0.839
avgDeathsPerYear            1.5344      0.234      6.547      0.000       1.075       1.994
incidenceRate               0.6751      0.030     22.450      0.000       0.616       0.734
medIncome                   0.0439      0.028      1.592      0.112      -0.010       0.098
popEst2015                 -0.4740      0.224     -2.116      0.034      -0.913      -0.035
studyPerCap                 0.0201      0.026      0.778      0.437      -0.031       0.071
MedianAgeFemale            -0.0622      0.019     -3.226      0.001      -0.100      -0.024
AvgHouseholdSize            0.0216      0.016      1.370      0.171      -0.009       0.053
PercentMarried              0.1002      0.032      3.170      0.002       0.038       0.162
PctHS18_24                  0.0354      0.013      2.821      0.005       0.011       0.060
PctBachDeg18_24            -0.0451      0.022     -2.014      0.044      -0.089      -0.001
PctHS25_Over                0.0464      0.020      2.362      0.018       0.008       0.085
PctBachDeg25_Over          -0.1383      0.024     -5.647      0.000      -0.186      -0.090
PctEmployed16_Over         -0.0428      0.023     -1.855      0.064      -0.088       0.002
PctUnemployed16_Over        0.0534      0.020      2.632      0.009       0.014       0.093
PctPrivateCoverageAlone     0.0300      0.017      1.816      0.069      -0.002       0.062
PctPublicCoverageAlone      0.0737      0.021      3.507      0.000       0.032       0.115
PctWhite                   -0.0605      0.023     -2.680      0.007      -0.105      -0.016
PctBlack                   -0.0769      0.023     -3.390      0.001      -0.121      -0.032
PctOtherRace               -0.0926      0.022     -4.299      0.000      -0.135      -0.050
PctMarriedHouseholds       -0.1820      0.034     -5.333      0.000      -0.249      -0.115
BirthRate                  -0.0492      0.016     -3.061      0.002      -0.081      -0.018
(45201, 48021.6]           -0.0057      0.005     -1.199      0.231      -0.015       0.004
(48021.6, 51046.4]         -0.0094      0.005     -1.919      0.055      -0.019       0.000
(51046.4, 54545.6]         -0.0082      0.005     -1.684      0.092      -0.018       0.001
[22640, 34218.1]            0.0231      0.006      4.002      0.000       0.012       0.034
Alaska                      0.0472      0.021      2.204      0.028       0.005       0.089
Arizona                    -0.0659      0.021     -3.165      0.002      -0.107      -0.025
Arkansas                    0.0367      0.010      3.733      0.000       0.017       0.056
California                 -0.0557      0.012     -4.539      0.000      -0.080      -0.032
Colorado                   -0.0655      0.012     -5.654      0.000      -0.088      -0.043
Connecticut                -0.0657      0.030     -2.218      0.027      -0.124      -0.008
Delaware                   -0.0414      0.046     -0.895      0.371      -0.132       0.049
Georgia                    -0.0223      0.007     -3.078      0.002      -0.037      -0.008
Hawaii                     -0.1348      0.041     -3.308      0.001      -0.215      -0.055
Idaho                      -0.0588      0.013     -4.467      0.000      -0.085      -0.033
Illinois                   -0.0153      0.009     -1.779      0.075      -0.032       0.002
Indiana                     0.0170      0.009      1.859      0.063      -0.001       0.035
Iowa                       -0.0398      0.009     -4.484      0.000      -0.057      -0.022
Kansas                      0.0334      0.011      3.112      0.002       0.012       0.054
Kentucky                    0.0311      0.008      3.685      0.000       0.015       0.048
Massachusetts              -0.0523      0.021     -2.519      0.012      -0.093      -0.012
Michigan                   -0.0137      0.009     -1.446      0.148      -0.032       0.005
Mississippi                 0.0245      0.010      2.484      0.013       0.005       0.044
Missouri                    0.0143      0.008      1.701      0.089      -0.002       0.031
Montana                    -0.0638      0.013     -4.932      0.000      -0.089      -0.038
Nebraska                   -0.0290      0.010     -2.790      0.005      -0.049      -0.009
Nevada                     -0.0233      0.021     -1.085      0.278      -0.065       0.019
New Hampshire              -0.0197      0.023     -0.839      0.401      -0.066       0.026
New Jersey                 -0.0273      0.017     -1.570      0.117      -0.061       0.007
New Mexico                 -0.0717      0.015     -4.681      0.000      -0.102      -0.042
New York                   -0.0558      0.011     -5.194      0.000      -0.077      -0.035
North Carolina             -0.0237      0.009     -2.667      0.008      -0.041      -0.006
North Dakota               -0.0185      0.012     -1.483      0.138      -0.043       0.006
Oklahoma                    0.0328      0.010      3.181      0.001       0.013       0.053
Oregon                     -0.0420      0.013     -3.192      0.001      -0.068      -0.016
Pennsylvania               -0.0453      0.011     -4.280      0.000      -0.066      -0.025
Rhode Island               -0.0400      0.033     -1.213      0.225      -0.105       0.025
South Dakota               -0.0325      0.012     -2.816      0.005      -0.055      -0.010
Tennessee                   0.0228      0.009      2.614      0.009       0.006       0.040
Texas                       0.0072      0.007      1.012      0.312      -0.007       0.021
Utah                       -0.1007      0.018     -5.746      0.000      -0.135      -0.066
Vermont                    -0.0190      0.021     -0.915      0.360      -0.060       0.022
Virginia                    0.0175      0.008      2.166      0.030       0.002       0.033
Washington                 -0.0336      0.013     -2.521      0.012      -0.060      -0.007
Wisconsin                  -0.0178      0.010     -1.827      0.068      -0.037       0.001
Wyoming                    -0.0185      0.016     -1.128      0.259      -0.051       0.014
==============================================================================
Omnibus:                      200.005   Durbin-Watson:                   2.006
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1225.834
Skew:                           0.108   Prob(JB):                    6.51e-267
Kurtosis:                       6.582   Cond. No.                         485.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

studyPerCap 0.437
Delaware 0.371
New Hampshire 0.401
Texas 0.312
Vermont 0.360
is insignificant in presence of other variables

In [47]:
X_train_3=X_train_2.drop(['studyPerCap','Delaware','New Hampshire','Texas','Vermont'],axis=1)
X_train_3=sm.add_constant(X_train_3)
lm3=sm.OLS(y_train,X_train_3).fit()
print(lm3.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.590
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     51.50
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:01   Log-Likelihood:                 3045.4
No. Observations:                2285   AIC:                            -5965.
Df Residuals:                    2222   BIC:                            -5603.
Df Model:                          62                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       0.2774      0.037      7.470      0.000       0.205       0.350
avgAnnCount                -1.1685      0.159     -7.356      0.000      -1.480      -0.857
avgDeathsPerYear            1.5349      0.234      6.566      0.000       1.076       1.993
incidenceRate               0.6686      0.030     22.557      0.000       0.610       0.727
medIncome                   0.0398      0.027      1.449      0.147      -0.014       0.094
popEst2015                 -0.4633      0.223     -2.081      0.038      -0.900      -0.027
MedianAgeFemale            -0.0660      0.019     -3.451      0.001      -0.103      -0.028
AvgHouseholdSize            0.0233      0.016      1.483      0.138      -0.008       0.054
PercentMarried              0.1043      0.031      3.324      0.001       0.043       0.166
PctHS18_24                  0.0369      0.012      2.955      0.003       0.012       0.061
PctBachDeg18_24            -0.0481      0.022     -2.157      0.031      -0.092      -0.004
PctHS25_Over                0.0399      0.019      2.117      0.034       0.003       0.077
PctBachDeg25_Over          -0.1440      0.024     -6.003      0.000      -0.191      -0.097
PctEmployed16_Over         -0.0479      0.023     -2.104      0.036      -0.093      -0.003
PctUnemployed16_Over        0.0508      0.020      2.543      0.011       0.012       0.090
PctPrivateCoverageAlone     0.0284      0.016      1.728      0.084      -0.004       0.061
PctPublicCoverageAlone      0.0658      0.020      3.240      0.001       0.026       0.106
PctWhite                   -0.0616      0.023     -2.737      0.006      -0.106      -0.017
PctBlack                   -0.0775      0.023     -3.434      0.001      -0.122      -0.033
PctOtherRace               -0.0893      0.021     -4.187      0.000      -0.131      -0.047
PctMarriedHouseholds       -0.1849      0.034     -5.430      0.000      -0.252      -0.118
BirthRate                  -0.0479      0.016     -2.989      0.003      -0.079      -0.016
(45201, 48021.6]           -0.0058      0.005     -1.219      0.223      -0.015       0.004
(48021.6, 51046.4]         -0.0094      0.005     -1.920      0.055      -0.019       0.000
(51046.4, 54545.6]         -0.0085      0.005     -1.754      0.080      -0.018       0.001
[22640, 34218.1]            0.0230      0.006      3.984      0.000       0.012       0.034
Alaska                      0.0449      0.021      2.120      0.034       0.003       0.086
Arizona                    -0.0692      0.020     -3.384      0.001      -0.109      -0.029
Arkansas                    0.0358      0.010      3.719      0.000       0.017       0.055
California                 -0.0579      0.012     -4.934      0.000      -0.081      -0.035
Colorado                   -0.0667      0.011     -5.974      0.000      -0.089      -0.045
Connecticut                -0.0646      0.030     -2.185      0.029      -0.123      -0.007
Georgia                    -0.0242      0.007     -3.489      0.000      -0.038      -0.011
Hawaii                     -0.1357      0.041     -3.343      0.001      -0.215      -0.056
Idaho                      -0.0621      0.013     -4.914      0.000      -0.087      -0.037
Illinois                   -0.0151      0.008     -1.806      0.071      -0.032       0.001
Indiana                     0.0160      0.009      1.794      0.073      -0.001       0.034
Iowa                       -0.0401      0.009     -4.669      0.000      -0.057      -0.023
Kansas                      0.0326      0.011      3.077      0.002       0.012       0.053
Kentucky                    0.0304      0.008      3.710      0.000       0.014       0.046
Massachusetts              -0.0508      0.021     -2.466      0.014      -0.091      -0.010
Michigan                   -0.0145      0.009     -1.577      0.115      -0.033       0.004
Mississippi                 0.0228      0.010      2.357      0.019       0.004       0.042
Missouri                    0.0131      0.008      1.611      0.107      -0.003       0.029
Montana                    -0.0650      0.013     -5.155      0.000      -0.090      -0.040
Nebraska                   -0.0301      0.010     -3.015      0.003      -0.050      -0.011
Nevada                     -0.0259      0.021     -1.217      0.224      -0.068       0.016
New Jersey                 -0.0265      0.017     -1.529      0.126      -0.060       0.007
New Mexico                 -0.0743      0.015     -5.036      0.000      -0.103      -0.045
New York                   -0.0553      0.011     -5.216      0.000      -0.076      -0.034
North Carolina             -0.0254      0.009     -2.974      0.003      -0.042      -0.009
North Dakota               -0.0205      0.012     -1.705      0.088      -0.044       0.003
Oklahoma                    0.0305      0.010      3.094      0.002       0.011       0.050
Oregon                     -0.0438      0.013     -3.420      0.001      -0.069      -0.019
Pennsylvania               -0.0449      0.011     -4.274      0.000      -0.065      -0.024
Rhode Island               -0.0400      0.033     -1.217      0.224      -0.104       0.024
South Dakota               -0.0339      0.011     -3.025      0.003      -0.056      -0.012
Tennessee                   0.0219      0.008      2.583      0.010       0.005       0.039
Utah                       -0.1046      0.017     -6.123      0.000      -0.138      -0.071
Virginia                    0.0160      0.008      2.070      0.039       0.001       0.031
Washington                 -0.0357      0.013     -2.776      0.006      -0.061      -0.010
Wisconsin                  -0.0177      0.009     -1.875      0.061      -0.036       0.001
Wyoming                    -0.0209      0.016     -1.304      0.192      -0.052       0.011
==============================================================================
Omnibus:                      198.895   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1205.909
Skew:                           0.112   Prob(JB):                    1.38e-262
Kurtosis:                       6.552   Cond. No.                         484.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

medIncome 0.147
AvgHouseholdSize 0.138
AvgHouseholdSize 0.084
(45201, 48021.6] 0.223
(48021.6, 51046.4] 0.055
(51046.4, 54545.6] 0.080
Illinois 0.071
Indiana 0.073
Michigan 0.115
Missouri 0.107
Nevada 0.224
New Jersey 0.126
North Dakota 0.088
Rhode Island 0.224
Wisconsin 0.061
Wyoming 0.192
is insignificant in presence of other variables

In [48]:
X_train_4=X_train_3.drop(['medIncome','AvgHouseholdSize','AvgHouseholdSize','PctPrivateCoverageAlone','(45201, 48021.6]','(48021.6, 51046.4]','(51046.4, 54545.6]','Illinois','Indiana','Michigan','Missouri','Nevada','New Jersey','North Dakota','Rhode Island','Wisconsin','Wyoming'],axis=1)
X_train_4=sm.add_constant(X_train_4)
lm4=sm.OLS(y_train,X_train_4).fit()
print(lm4.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.582
Model:                            OLS   Adj. R-squared:                  0.574
Method:                 Least Squares   F-statistic:                     67.86
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:01   Log-Likelihood:                 3025.4
No. Observations:                2285   AIC:                            -5957.
Df Residuals:                    2238   BIC:                            -5687.
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.2888      0.034      8.532      0.000       0.222       0.355
avgAnnCount               -1.1189      0.151     -7.406      0.000      -1.415      -0.823
avgDeathsPerYear           1.4851      0.231      6.426      0.000       1.032       1.938
incidenceRate              0.6692      0.029     23.104      0.000       0.612       0.726
popEst2015                -0.4490      0.222     -2.020      0.044      -0.885      -0.013
MedianAgeFemale           -0.0860      0.019     -4.647      0.000      -0.122      -0.050
PercentMarried             0.0815      0.030      2.697      0.007       0.022       0.141
PctHS18_24                 0.0395      0.012      3.232      0.001       0.016       0.063
PctBachDeg18_24           -0.0404      0.022     -1.840      0.066      -0.083       0.003
PctHS25_Over               0.0529      0.018      2.933      0.003       0.018       0.088
PctBachDeg25_Over         -0.1261      0.022     -5.605      0.000      -0.170      -0.082
PctEmployed16_Over        -0.0390      0.021     -1.837      0.066      -0.081       0.003
PctUnemployed16_Over       0.0542      0.019      2.813      0.005       0.016       0.092
PctPublicCoverageAlone     0.0526      0.017      3.020      0.003       0.018       0.087
PctWhite                  -0.0579      0.021     -2.706      0.007      -0.100      -0.016
PctBlack                  -0.0649      0.022     -3.017      0.003      -0.107      -0.023
PctOtherRace              -0.0879      0.021     -4.204      0.000      -0.129      -0.047
PctMarriedHouseholds      -0.1342      0.029     -4.565      0.000      -0.192      -0.077
BirthRate                 -0.0534      0.016     -3.371      0.001      -0.084      -0.022
[22640, 34218.1]           0.0242      0.006      4.245      0.000       0.013       0.035
Alaska                     0.0562      0.021      2.711      0.007       0.016       0.097
Arizona                   -0.0665      0.020     -3.265      0.001      -0.106      -0.027
Arkansas                   0.0369      0.010      3.867      0.000       0.018       0.056
California                -0.0485      0.011     -4.257      0.000      -0.071      -0.026
Colorado                  -0.0599      0.011     -5.463      0.000      -0.081      -0.038
Connecticut               -0.0505      0.029     -1.716      0.086      -0.108       0.007
Georgia                   -0.0242      0.007     -3.538      0.000      -0.038      -0.011
Hawaii                    -0.1182      0.040     -2.923      0.003      -0.197      -0.039
Idaho                     -0.0618      0.012     -4.976      0.000      -0.086      -0.037
Iowa                      -0.0380      0.008     -4.724      0.000      -0.054      -0.022
Kansas                     0.0329      0.010      3.180      0.001       0.013       0.053
Kentucky                   0.0335      0.008      4.241      0.000       0.018       0.049
Massachusetts             -0.0385      0.020     -1.892      0.059      -0.078       0.001
Mississippi                0.0228      0.010      2.356      0.019       0.004       0.042
Montana                   -0.0633      0.012     -5.170      0.000      -0.087      -0.039
Nebraska                  -0.0268      0.009     -2.836      0.005      -0.045      -0.008
New Mexico                -0.0664      0.015     -4.519      0.000      -0.095      -0.038
New York                  -0.0514      0.010     -5.072      0.000      -0.071      -0.032
North Carolina            -0.0232      0.008     -2.768      0.006      -0.040      -0.007
Oklahoma                   0.0314      0.010      3.291      0.001       0.013       0.050
Oregon                    -0.0385      0.013     -3.041      0.002      -0.063      -0.014
Pennsylvania              -0.0431      0.010     -4.237      0.000      -0.063      -0.023
South Dakota              -0.0313      0.011     -2.946      0.003      -0.052      -0.010
Tennessee                  0.0235      0.008      2.846      0.004       0.007       0.040
Utah                      -0.1015      0.017     -5.981      0.000      -0.135      -0.068
Virginia                   0.0211      0.008      2.808      0.005       0.006       0.036
Washington                -0.0307      0.013     -2.422      0.016      -0.056      -0.006
==============================================================================
Omnibus:                      198.150   Durbin-Watson:                   2.001
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1223.700
Skew:                           0.082   Prob(JB):                    1.89e-266
Kurtosis:                       6.581   Cond. No.                         438.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

PctBachDeg18_24 0.066
PctEmployed16_Over 0.066
is insignificant in presence of other variables

In [49]:
X_train_5=X_train_4.drop(['PctBachDeg18_24','PctEmployed16_Over'],axis=1)
X_train_5=sm.add_constant(X_train_5)
lm5=sm.OLS(y_train,X_train_5).fit()
print(lm5.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.581
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                     70.61
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:02   Log-Likelihood:                 3021.6
No. Observations:                2285   AIC:                            -5953.
Df Residuals:                    2240   BIC:                            -5695.
Df Model:                          44                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.2541      0.030      8.408      0.000       0.195       0.313
avgAnnCount               -1.1480      0.150     -7.641      0.000      -1.443      -0.853
avgDeathsPerYear           1.4852      0.231      6.431      0.000       1.032       1.938
incidenceRate              0.6666      0.029     23.053      0.000       0.610       0.723
popEst2015                -0.4309      0.223     -1.936      0.053      -0.867       0.005
MedianAgeFemale           -0.0686      0.016     -4.301      0.000      -0.100      -0.037
PercentMarried             0.0507      0.027      1.886      0.059      -0.002       0.103
PctHS18_24                 0.0439      0.012      3.652      0.000       0.020       0.067
PctHS25_Over               0.0463      0.018      2.587      0.010       0.011       0.081
PctBachDeg25_Over         -0.1489      0.021     -7.145      0.000      -0.190      -0.108
PctUnemployed16_Over       0.0672      0.018      3.636      0.000       0.031       0.103
PctPublicCoverageAlone     0.0632      0.017      3.749      0.000       0.030       0.096
PctWhite                  -0.0532      0.021     -2.492      0.013      -0.095      -0.011
PctBlack                  -0.0621      0.021     -2.891      0.004      -0.104      -0.020
PctOtherRace              -0.0891      0.021     -4.262      0.000      -0.130      -0.048
PctMarriedHouseholds      -0.1063      0.027     -3.911      0.000      -0.160      -0.053
BirthRate                 -0.0518      0.016     -3.269      0.001      -0.083      -0.021
[22640, 34218.1]           0.0252      0.006      4.433      0.000       0.014       0.036
Alaska                     0.0566      0.021      2.731      0.006       0.016       0.097
Arizona                   -0.0635      0.020     -3.122      0.002      -0.103      -0.024
Arkansas                   0.0379      0.010      3.973      0.000       0.019       0.057
California                -0.0485      0.011     -4.252      0.000      -0.071      -0.026
Colorado                  -0.0596      0.011     -5.438      0.000      -0.081      -0.038
Connecticut               -0.0559      0.029     -1.902      0.057      -0.114       0.002
Georgia                   -0.0233      0.007     -3.413      0.001      -0.037      -0.010
Hawaii                    -0.1157      0.040     -2.861      0.004      -0.195      -0.036
Idaho                     -0.0577      0.012     -4.673      0.000      -0.082      -0.033
Iowa                      -0.0397      0.008     -4.963      0.000      -0.055      -0.024
Kansas                     0.0340      0.010      3.289      0.001       0.014       0.054
Kentucky                   0.0335      0.008      4.249      0.000       0.018       0.049
Massachusetts             -0.0450      0.020     -2.222      0.026      -0.085      -0.005
Mississippi                0.0228      0.010      2.358      0.018       0.004       0.042
Montana                   -0.0600      0.012     -4.924      0.000      -0.084      -0.036
Nebraska                  -0.0281      0.009     -2.994      0.003      -0.047      -0.010
New Mexico                -0.0651      0.015     -4.433      0.000      -0.094      -0.036
New York                  -0.0550      0.010     -5.460      0.000      -0.075      -0.035
North Carolina            -0.0235      0.008     -2.807      0.005      -0.040      -0.007
Oklahoma                   0.0351      0.009      3.716      0.000       0.017       0.054
Oregon                    -0.0377      0.013     -2.976      0.003      -0.063      -0.013
Pennsylvania              -0.0443      0.010     -4.360      0.000      -0.064      -0.024
South Dakota              -0.0313      0.011     -2.953      0.003      -0.052      -0.010
Tennessee                  0.0232      0.008      2.816      0.005       0.007       0.039
Utah                      -0.0975      0.017     -5.757      0.000      -0.131      -0.064
Virginia                   0.0205      0.007      2.730      0.006       0.006       0.035
Washington                -0.0282      0.013     -2.222      0.026      -0.053      -0.003
==============================================================================
Omnibus:                      195.423   Durbin-Watson:                   1.999
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1172.576
Skew:                           0.098   Prob(JB):                    2.39e-255
Kurtosis:                       6.504   Cond. No.                         417.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

popEst2015 0.053
PercentMarried 0.059
Connecticut 0.057
delete them

In [50]:
X_train_6=X_train_5.drop(['popEst2015','PercentMarried','Connecticut'],axis=1)
X_train_6=sm.add_constant(X_train_6)
lm6=sm.OLS(y_train,X_train_6).fit()
print(lm6.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.579
Model:                            OLS   Adj. R-squared:                  0.571
Method:                 Least Squares   F-statistic:                     75.24
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:02   Log-Likelihood:                 3016.0
No. Observations:                2285   AIC:                            -5948.
Df Residuals:                    2243   BIC:                            -5707.
Df Model:                          41                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.2517      0.030      8.321      0.000       0.192       0.311
avgAnnCount               -1.1731      0.149     -7.863      0.000      -1.466      -0.881
avgDeathsPerYear           1.1333      0.150      7.555      0.000       0.839       1.427
incidenceRate              0.6718      0.029     23.306      0.000       0.615       0.728
MedianAgeFemale           -0.0516      0.014     -3.649      0.000      -0.079      -0.024
PctHS18_24                 0.0453      0.012      3.775      0.000       0.022       0.069
PctHS25_Over               0.0437      0.018      2.439      0.015       0.009       0.079
PctBachDeg25_Over         -0.1486      0.021     -7.128      0.000      -0.189      -0.108
PctUnemployed16_Over       0.0641      0.018      3.493      0.000       0.028       0.100
PctPublicCoverageAlone     0.0633      0.017      3.746      0.000       0.030       0.096
PctWhite                  -0.0471      0.021     -2.217      0.027      -0.089      -0.005
PctBlack                  -0.0634      0.021     -2.958      0.003      -0.105      -0.021
PctOtherRace              -0.0900      0.021     -4.311      0.000      -0.131      -0.049
PctMarriedHouseholds      -0.0700      0.017     -4.050      0.000      -0.104      -0.036
BirthRate                 -0.0458      0.016     -2.928      0.003      -0.076      -0.015
[22640, 34218.1]           0.0242      0.006      4.270      0.000       0.013       0.035
Alaska                     0.0568      0.021      2.738      0.006       0.016       0.097
Arizona                   -0.0641      0.020     -3.146      0.002      -0.104      -0.024
Arkansas                   0.0393      0.010      4.137      0.000       0.021       0.058
California                -0.0528      0.011     -4.704      0.000      -0.075      -0.031
Colorado                  -0.0606      0.011     -5.531      0.000      -0.082      -0.039
Georgia                   -0.0237      0.007     -3.478      0.001      -0.037      -0.010
Hawaii                    -0.1150      0.041     -2.839      0.005      -0.194      -0.036
Idaho                     -0.0564      0.012     -4.575      0.000      -0.081      -0.032
Iowa                      -0.0385      0.008     -4.830      0.000      -0.054      -0.023
Kansas                     0.0371      0.010      3.591      0.000       0.017       0.057
Kentucky                   0.0338      0.008      4.293      0.000       0.018       0.049
Massachusetts             -0.0433      0.020     -2.148      0.032      -0.083      -0.004
Mississippi                0.0235      0.010      2.443      0.015       0.005       0.042
Montana                   -0.0603      0.012     -4.942      0.000      -0.084      -0.036
Nebraska                  -0.0262      0.009     -2.812      0.005      -0.044      -0.008
New Mexico                -0.0679      0.015     -4.636      0.000      -0.097      -0.039
New York                  -0.0560      0.010     -5.593      0.000      -0.076      -0.036
North Carolina            -0.0235      0.008     -2.807      0.005      -0.040      -0.007
Oklahoma                   0.0369      0.009      3.914      0.000       0.018       0.055
Oregon                    -0.0372      0.013     -2.932      0.003      -0.062      -0.012
Pennsylvania              -0.0433      0.010     -4.303      0.000      -0.063      -0.024
South Dakota              -0.0304      0.011     -2.876      0.004      -0.051      -0.010
Tennessee                  0.0239      0.008      2.900      0.004       0.008       0.040
Utah                      -0.0985      0.017     -5.813      0.000      -0.132      -0.065
Virginia                   0.0192      0.007      2.565      0.010       0.005       0.034
Washington                -0.0283      0.013     -2.226      0.026      -0.053      -0.003
==============================================================================
Omnibus:                      198.194   Durbin-Watson:                   1.995
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1225.547
Skew:                           0.081   Prob(JB):                    7.51e-267
Kurtosis:                       6.584   Cond. No.                         273.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [51]:
X_train_6.columns
Out[51]:
Index(['const', 'avgAnnCount', 'avgDeathsPerYear', 'incidenceRate',
       'MedianAgeFemale', 'PctHS18_24', 'PctHS25_Over', 'PctBachDeg25_Over',
       'PctUnemployed16_Over', 'PctPublicCoverageAlone', 'PctWhite',
       'PctBlack', 'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate',
       '[22640, 34218.1]', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Georgia', 'Hawaii', 'Idaho', 'Iowa', 'Kansas', 'Kentucky',
       'Massachusetts', 'Mississippi', 'Montana', 'Nebraska', 'New Mexico',
       'New York', 'North Carolina', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'South Dakota', 'Tennessee', 'Utah', 'Virginia', 'Washington'],
      dtype='object')

calculate vif

In [52]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif=pd.DataFrame()
X=X_train_6
vif['features']=X.columns
vif['VIF']=[variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif['VIF']=round(vif['VIF'],2)
vif=vif.sort_values(by = "VIF",ascending=False)
vif
Out[52]:
features VIF
0 const 491.17
1 avgAnnCount 17.78
2 avgDeathsPerYear 17.14
10 PctWhite 8.44
11 PctBlack 7.34
7 PctBachDeg25_Over 4.56
6 PctHS25_Over 4.07
9 PctPublicCoverageAlone 3.74
8 PctUnemployed16_Over 2.65
13 PctMarriedHouseholds 2.30
25 Kansas 1.86
12 PctOtherRace 1.79
4 MedianAgeFemale 1.60
15 [22640, 34218.1] 1.53
5 PctHS18_24 1.50
28 Mississippi 1.34
3 incidenceRate 1.31
16 Alaska 1.31
19 California 1.28
26 Kentucky 1.25
21 Georgia 1.24
31 New Mexico 1.20
34 Oklahoma 1.20
37 South Dakota 1.19
14 BirthRate 1.16
40 Virginia 1.15
22 Hawaii 1.15
36 Pennsylvania 1.14
33 North Carolina 1.14
30 Nebraska 1.13
20 Colorado 1.13
24 Iowa 1.13
18 Arkansas 1.12
38 Tennessee 1.12
32 New York 1.08
35 Oregon 1.08
41 Washington 1.08
29 Montana 1.07
17 Arizona 1.07
39 Utah 1.07
23 Idaho 1.06
27 Massachusetts 1.05
In [53]:
a=X.drop(['avgAnnCount','avgDeathsPerYear','PctWhite','PctBlack'],axis=1)
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif2=pd.DataFrame()
vif2['features']=a.columns
vif2['VIF']=[variance_inflation_factor(a.values,i) for i in range(a.shape[1])]
vif2['VIF']=round(vif2['VIF'],2)
vif2=vif2.sort_values(by = "VIF",ascending=False)
vif2
Out[53]:
features VIF
0 const 272.85
5 PctBachDeg25_Over 4.33
4 PctHS25_Over 4.05
7 PctPublicCoverageAlone 3.61
6 PctUnemployed16_Over 2.38
9 PctMarriedHouseholds 1.80
2 MedianAgeFemale 1.49
8 PctOtherRace 1.49
3 PctHS18_24 1.48
11 [22640, 34218.1] 1.48
1 incidenceRate 1.30
15 California 1.23
22 Kentucky 1.21
24 Mississippi 1.17
27 New Mexico 1.16
10 BirthRate 1.15
29 North Carolina 1.13
32 Pennsylvania 1.13
17 Georgia 1.12
21 Kansas 1.12
26 Nebraska 1.12
16 Colorado 1.11
34 Tennessee 1.11
20 Iowa 1.11
36 Virginia 1.11
14 Arkansas 1.10
33 South Dakota 1.09
35 Utah 1.07
37 Washington 1.07
28 New York 1.07
12 Alaska 1.07
31 Oregon 1.06
30 Oklahoma 1.05
13 Arizona 1.05
19 Idaho 1.05
25 Montana 1.04
23 Massachusetts 1.04
18 Hawaii 1.01
In [54]:
X_train_7=X_train_6.drop(['avgAnnCount','avgDeathsPerYear','PctWhite','PctBlack'],axis=1)
X_train_7=sm.add_constant(X_train_7)
lm7=sm.OLS(y_train,X_train_7).fit()
print(lm7.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.566
Model:                            OLS   Adj. R-squared:                  0.559
Method:                 Least Squares   F-statistic:                     79.34
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:06   Log-Likelihood:                 2982.4
No. Observations:                2285   AIC:                            -5889.
Df Residuals:                    2247   BIC:                            -5671.
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.2103      0.023      9.198      0.000       0.165       0.255
incidenceRate              0.6504      0.029     22.384      0.000       0.593       0.707
MedianAgeFemale           -0.0622      0.014     -4.490      0.000      -0.089      -0.035
PctHS18_24                 0.0506      0.012      4.181      0.000       0.027       0.074
PctHS25_Over               0.0428      0.018      2.361      0.018       0.007       0.078
PctBachDeg25_Over         -0.1563      0.021     -7.589      0.000      -0.197      -0.116
PctUnemployed16_Over       0.0788      0.018      4.468      0.000       0.044       0.113
PctPublicCoverageAlone     0.0651      0.017      3.870      0.000       0.032       0.098
PctOtherRace              -0.0733      0.019     -3.800      0.000      -0.111      -0.035
PctMarriedHouseholds      -0.0707      0.016     -4.553      0.000      -0.101      -0.040
BirthRate                 -0.0533      0.016     -3.382      0.001      -0.084      -0.022
[22640, 34218.1]           0.0202      0.006      3.587      0.000       0.009       0.031
Alaska                     0.0799      0.019      4.199      0.000       0.043       0.117
Arizona                   -0.0551      0.021     -2.685      0.007      -0.095      -0.015
Arkansas                   0.0405      0.010      4.230      0.000       0.022       0.059
California                -0.0487      0.011     -4.365      0.000      -0.071      -0.027
Colorado                  -0.0554      0.011     -5.037      0.000      -0.077      -0.034
Georgia                   -0.0265      0.007     -4.033      0.000      -0.039      -0.014
Hawaii                    -0.0823      0.038     -2.145      0.032      -0.158      -0.007
Idaho                     -0.0517      0.012     -4.140      0.000      -0.076      -0.027
Iowa                      -0.0312      0.008     -3.885      0.000      -0.047      -0.015
Kansas                    -0.0126      0.008     -1.558      0.119      -0.029       0.003
Kentucky                   0.0372      0.008      4.732      0.000       0.022       0.053
Massachusetts             -0.0405      0.020     -1.985      0.047      -0.081      -0.000
Mississippi                0.0169      0.009      1.852      0.064      -0.001       0.035
Montana                   -0.0498      0.012     -4.075      0.000      -0.074      -0.026
Nebraska                  -0.0183      0.009     -1.948      0.051      -0.037       0.000
New Mexico                -0.0589      0.015     -4.038      0.000      -0.088      -0.030
New York                  -0.0559      0.010     -5.537      0.000      -0.076      -0.036
North Carolina            -0.0227      0.008     -2.702      0.007      -0.039      -0.006
Oklahoma                   0.0497      0.009      5.554      0.000       0.032       0.067
Oregon                    -0.0306      0.013     -2.403      0.016      -0.056      -0.006
Pennsylvania              -0.0382      0.010     -3.768      0.000      -0.058      -0.018
South Dakota              -0.0164      0.010     -1.594      0.111      -0.037       0.004
Tennessee                  0.0262      0.008      3.147      0.002       0.010       0.043
Utah                      -0.0946      0.017     -5.513      0.000      -0.128      -0.061
Virginia                   0.0217      0.007      2.915      0.004       0.007       0.036
Washington                -0.0217      0.013     -1.700      0.089      -0.047       0.003
==============================================================================
Omnibus:                      192.524   Durbin-Watson:                   1.974
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1126.250
Skew:                           0.106   Prob(JB):                    2.74e-245
Kurtosis:                       6.433   Cond. No.                         43.6
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [55]:
X_train_8=X_train_7.drop(['Washington','South Dakota','Nebraska','Mississippi','Kansas'],axis=1)
X_train_8=sm.add_constant(X_train_8)
lm8=sm.OLS(y_train,X_train_8).fit()
print(lm8.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.564
Model:                            OLS   Adj. R-squared:                  0.558
Method:                 Least Squares   F-statistic:                     90.93
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:06   Log-Likelihood:                 2975.2
No. Observations:                2285   AIC:                            -5884.
Df Residuals:                    2252   BIC:                            -5695.
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.2093      0.022      9.322      0.000       0.165       0.253
incidenceRate              0.6529      0.029     22.520      0.000       0.596       0.710
MedianAgeFemale           -0.0684      0.014     -4.973      0.000      -0.095      -0.041
PctHS18_24                 0.0545      0.012      4.547      0.000       0.031       0.078
PctHS25_Over               0.0421      0.018      2.384      0.017       0.007       0.077
PctBachDeg25_Over         -0.1582      0.020     -7.834      0.000      -0.198      -0.119
PctUnemployed16_Over       0.0885      0.017      5.112      0.000       0.055       0.122
PctPublicCoverageAlone     0.0681      0.017      4.086      0.000       0.035       0.101
PctOtherRace              -0.0779      0.019     -4.080      0.000      -0.115      -0.040
PctMarriedHouseholds      -0.0711      0.016     -4.586      0.000      -0.102      -0.041
BirthRate                 -0.0616      0.015     -3.996      0.000      -0.092      -0.031
[22640, 34218.1]           0.0207      0.006      3.692      0.000       0.010       0.032
Alaska                     0.0795      0.019      4.178      0.000       0.042       0.117
Arizona                   -0.0546      0.021     -2.661      0.008      -0.095      -0.014
Arkansas                   0.0408      0.010      4.271      0.000       0.022       0.060
California                -0.0479      0.011     -4.332      0.000      -0.070      -0.026
Colorado                  -0.0530      0.011     -4.840      0.000      -0.075      -0.032
Georgia                   -0.0266      0.007     -4.082      0.000      -0.039      -0.014
Hawaii                    -0.0802      0.038     -2.087      0.037      -0.156      -0.005
Idaho                     -0.0496      0.012     -3.979      0.000      -0.074      -0.025
Iowa                      -0.0278      0.008     -3.496      0.000      -0.043      -0.012
Kentucky                   0.0368      0.008      4.712      0.000       0.021       0.052
Massachusetts             -0.0384      0.020     -1.880      0.060      -0.078       0.002
Montana                   -0.0466      0.012     -3.822      0.000      -0.071      -0.023
New Mexico                -0.0575      0.015     -3.950      0.000      -0.086      -0.029
New York                  -0.0543      0.010     -5.402      0.000      -0.074      -0.035
North Carolina            -0.0223      0.008     -2.675      0.008      -0.039      -0.006
Oklahoma                   0.0516      0.009      5.772      0.000       0.034       0.069
Oregon                    -0.0297      0.013     -2.337      0.020      -0.055      -0.005
Pennsylvania              -0.0364      0.010     -3.593      0.000      -0.056      -0.017
Tennessee                  0.0258      0.008      3.107      0.002       0.010       0.042
Utah                      -0.0924      0.017     -5.385      0.000      -0.126      -0.059
Virginia                   0.0236      0.007      3.188      0.001       0.009       0.038
==============================================================================
Omnibus:                      198.759   Durbin-Watson:                   1.971
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1209.066
Skew:                           0.107   Prob(JB):                    2.85e-263
Kurtosis:                       6.557   Cond. No.                         43.6
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [56]:
X_train_9=X_train_8.drop(['Massachusetts'],axis=1)
X_train_9=sm.add_constant(X_train_9)
lm9=sm.OLS(y_train,X_train_9).fit()
print(lm9.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       TARGET_deathRate   R-squared:                       0.563
Model:                            OLS   Adj. R-squared:                  0.557
Method:                 Least Squares   F-statistic:                     93.64
Date:                Wed, 12 Aug 2020   Prob (F-statistic):               0.00
Time:                        21:21:06   Log-Likelihood:                 2973.5
No. Observations:                2285   AIC:                            -5883.
Df Residuals:                    2253   BIC:                            -5699.
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
==========================================================================================
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.2121      0.022      9.467      0.000       0.168       0.256
incidenceRate              0.6500      0.029     22.439      0.000       0.593       0.707
MedianAgeFemale           -0.0699      0.014     -5.090      0.000      -0.097      -0.043
PctHS18_24                 0.0546      0.012      4.548      0.000       0.031       0.078
PctHS25_Over               0.0422      0.018      2.390      0.017       0.008       0.077
PctBachDeg25_Over         -0.1621      0.020     -8.068      0.000      -0.201      -0.123
PctUnemployed16_Over       0.0889      0.017      5.137      0.000       0.055       0.123
PctPublicCoverageAlone     0.0653      0.017      3.935      0.000       0.033       0.098
PctOtherRace              -0.0796      0.019     -4.173      0.000      -0.117      -0.042
PctMarriedHouseholds      -0.0712      0.016     -4.589      0.000      -0.102      -0.041
BirthRate                 -0.0608      0.015     -3.942      0.000      -0.091      -0.031
[22640, 34218.1]           0.0210      0.006      3.730      0.000       0.010       0.032
Alaska                     0.0794      0.019      4.165      0.000       0.042       0.117
Arizona                   -0.0541      0.021     -2.634      0.008      -0.094      -0.014
Arkansas                   0.0411      0.010      4.306      0.000       0.022       0.060
California                -0.0468      0.011     -4.238      0.000      -0.068      -0.025
Colorado                  -0.0521      0.011     -4.756      0.000      -0.074      -0.031
Georgia                   -0.0264      0.007     -4.062      0.000      -0.039      -0.014
Hawaii                    -0.0795      0.038     -2.068      0.039      -0.155      -0.004
Idaho                     -0.0494      0.012     -3.962      0.000      -0.074      -0.025
Iowa                      -0.0274      0.008     -3.453      0.001      -0.043      -0.012
Kentucky                   0.0371      0.008      4.751      0.000       0.022       0.052
Montana                   -0.0460      0.012     -3.776      0.000      -0.070      -0.022
New Mexico                -0.0565      0.015     -3.881      0.000      -0.085      -0.028
New York                  -0.0536      0.010     -5.331      0.000      -0.073      -0.034
North Carolina            -0.0218      0.008     -2.616      0.009      -0.038      -0.005
Oklahoma                   0.0519      0.009      5.807      0.000       0.034       0.069
Oregon                    -0.0290      0.013     -2.285      0.022      -0.054      -0.004
Pennsylvania              -0.0361      0.010     -3.558      0.000      -0.056      -0.016
Tennessee                  0.0261      0.008      3.138      0.002       0.010       0.042
Utah                      -0.0924      0.017     -5.381      0.000      -0.126      -0.059
Virginia                   0.0239      0.007      3.229      0.001       0.009       0.038
==============================================================================
Omnibus:                      198.932   Durbin-Watson:                   1.973
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1207.585
Skew:                           0.111   Prob(JB):                    5.97e-263
Kurtosis:                       6.555   Cond. No.                         43.6
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [57]:
vif=pd.DataFrame()
X=X_train_9
vif['features']=X.columns
vif['VIF']=[variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif['VIF']=round(vif['VIF'],2)
vif=vif.sort_values(by = "VIF",ascending=False)
vif
Out[57]:
features VIF
0 const 260.84
5 PctBachDeg25_Over 4.10
4 PctHS25_Over 3.83
7 PctPublicCoverageAlone 3.50
6 PctUnemployed16_Over 2.28
9 PctMarriedHouseholds 1.79
2 MedianAgeFemale 1.46
11 [22640, 34218.1] 1.46
8 PctOtherRace 1.45
3 PctHS18_24 1.45
1 incidenceRate 1.28
15 California 1.20
21 Kentucky 1.18
23 New Mexico 1.14
28 Pennsylvania 1.12
29 Tennessee 1.10
25 North Carolina 1.10
16 Colorado 1.10
10 BirthRate 1.10
17 Georgia 1.09
14 Arkansas 1.09
31 Virginia 1.09
20 Iowa 1.08
12 Alaska 1.07
30 Utah 1.07
24 New York 1.06
19 Idaho 1.05
13 Arizona 1.05
27 Oregon 1.05
26 Oklahoma 1.04
22 Montana 1.03
18 Hawaii 1.01

Residual analysis of training data

In [58]:
y_train_target=lm9.predict(X_train_9)
In [59]:
fig = plt.figure()
ax=sns.distplot((y_train-y_train_target),bins=20)
fig.suptitle("Error Terms",fontsize=20)
plt.xlabel('Errors',fontsize=20)
Out[59]:
Text(0.5, 0, 'Errors')
In [60]:
ax1 = sns.distplot(y_train, hist=False, color="r", label="Actual Value")
sns.distplot(y_train_target, hist=False, color="b", label="Fitted Values" , ax=ax1)


plt.title('Actual vs Fitted Values for TARGET_deathRate')


plt.show()
plt.close()

We can see that the fitted values are reasonably close to the actual values, since the two distributions overlap a bit. However, there is definitely some room for improvement.

making predictions

applying scalign to test dataset

In [61]:
convars=['avgAnnCount','avgDeathsPerYear','TARGET_deathRate','incidenceRate','medIncome','popEst2015','povertyPercent','studyPerCap','MedianAge','MedianAgeMale','MedianAgeFemale','AvgHouseholdSize','PercentMarried','PctNoHS18_24','PctHS18_24','PctBachDeg18_24','PctHS25_Over','PctBachDeg25_Over','PctEmployed16_Over','PctUnemployed16_Over','PctPrivateCoverage','PctPrivateCoverageAlone','PctEmpPrivCoverage','PctPublicCoverage','PctPublicCoverageAlone','PctWhite','PctBlack','PctAsian','PctOtherRace','PctMarriedHouseholds','BirthRate']
df_test[convars]=scaler.transform(df_test[convars])
C:\Users\kanan\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
C:\Users\kanan\AppData\Roaming\Python\Python36\site-packages\pandas\core\indexing.py:966: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
In [62]:
df_test.describe()
Out[62]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap MedianAge MedianAgeMale ... South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia Wisconsin Wyoming
count 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 ... 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000 762.000000
mean 0.015511 0.013132 0.326566 0.242772 0.248876 0.010343 0.298906 0.018557 0.036767 0.402934 ... 0.017060 0.030184 0.068241 0.014436 0.003937 0.044619 0.013123 0.015748 0.019685 0.006562
std 0.032412 0.030410 0.104275 0.054162 0.124323 0.029314 0.145111 0.053997 0.071815 0.120952 ... 0.129581 0.171205 0.252325 0.119357 0.062663 0.206602 0.113878 0.124581 0.139007 0.080791
min -0.000026 0.000000 -0.101381 0.009745 0.016250 -0.000047 -0.011442 0.000000 0.003490 0.037825 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.001920 0.001785 0.262173 0.214822 0.159736 0.001072 0.192220 0.000000 0.025262 0.328605 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.004876 0.004783 0.328488 0.250845 0.229048 0.002890 0.279176 0.000000 0.030580 0.397163 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.013626 0.011405 0.391261 0.278192 0.310459 0.007601 0.379863 0.011630 0.035400 0.472813 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 0.379362 0.364461 0.677326 0.447494 1.029940 0.446133 0.990847 0.697626 0.992023 0.782506 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 90 columns

divide it into X_test and y_test

In [63]:
y_test=df_test.pop('TARGET_deathRate')
X_test=df_test
In [64]:
y_test.shape
Out[64]:
(762,)
In [65]:
X_test.shape
Out[65]:
(762, 89)
In [71]:
X_train_9.columns
Out[71]:
Index(['const', 'incidenceRate', 'MedianAgeFemale', 'PctHS18_24',
       'PctHS25_Over', 'PctBachDeg25_Over', 'PctUnemployed16_Over',
       'PctPublicCoverageAlone', 'PctOtherRace', 'PctMarriedHouseholds',
       'BirthRate', '[22640, 34218.1]', 'Alaska', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Georgia', 'Hawaii', 'Idaho', 'Iowa',
       'Kentucky', 'Montana', 'New Mexico', 'New York', 'North Carolina',
       'Oklahoma', 'Oregon', 'Pennsylvania', 'Tennessee', 'Utah', 'Virginia'],
      dtype='object')
In [72]:
X_train_9.drop(['const'],inplace=True,axis=1)
In [73]:
X_test_m1=X_test[X_train_9.columns]
X_test_m1=sm.add_constant(X_test_m1)
In [74]:
X_test_m1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 762 entries, 147 to 1280
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   const                   762 non-null    float64
 1   incidenceRate           762 non-null    float64
 2   MedianAgeFemale         762 non-null    float64
 3   PctHS18_24              762 non-null    float64
 4   PctHS25_Over            762 non-null    float64
 5   PctBachDeg25_Over       762 non-null    float64
 6   PctUnemployed16_Over    762 non-null    float64
 7   PctPublicCoverageAlone  762 non-null    float64
 8   PctOtherRace            762 non-null    float64
 9   PctMarriedHouseholds    762 non-null    float64
 10  BirthRate               762 non-null    float64
 11  [22640, 34218.1]        762 non-null    uint8  
 12  Alaska                  762 non-null    uint8  
 13  Arizona                 762 non-null    uint8  
 14  Arkansas                762 non-null    uint8  
 15  California              762 non-null    uint8  
 16  Colorado                762 non-null    uint8  
 17  Georgia                 762 non-null    uint8  
 18  Hawaii                  762 non-null    uint8  
 19  Idaho                   762 non-null    uint8  
 20  Iowa                    762 non-null    uint8  
 21  Kentucky                762 non-null    uint8  
 22  Montana                 762 non-null    uint8  
 23  New Mexico              762 non-null    uint8  
 24  New York                762 non-null    uint8  
 25  North Carolina          762 non-null    uint8  
 26  Oklahoma                762 non-null    uint8  
 27  Oregon                  762 non-null    uint8  
 28  Pennsylvania            762 non-null    uint8  
 29  Tennessee               762 non-null    uint8  
 30  Utah                    762 non-null    uint8  
 31  Virginia                762 non-null    uint8  
dtypes: float64(11), uint8(21)
memory usage: 87.1 KB
In [76]:
y_pred_m1=lm9.predict(X_test_m1)

model evaluation

In [77]:
fig=plt.figure()
plt.scatter(y_test,y_pred_m1)
fig.suptitle('y_test vs y_pred',fontsize=20)
plt.xlabel('y_test',fontsize=18)
plt.ylabel('y_pred',fontsize=18)
Out[77]:
Text(0, 0.5, 'y_pred')
In [78]:
ax1 = sns.distplot(y_test, hist=False, color="r", label="Actual Value")
sns.distplot(y_pred_m1, hist=False, color="b", label="Fitted Values" , ax=ax1)


plt.title('Actual vs Fitted Values of Test Dataset for TARGET_deathRate')


plt.show()
plt.close()
In [79]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_m1))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_m1))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_m1)))
Mean Absolute Error: 0.0503361345116567
Mean Squared Error: 0.0047229149770521995
Root Mean Squared Error: 0.06872346744054901
In [80]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_m1})
df1 = df.head(25)
In [83]:
df1.head()
Out[83]:
Actual Predicted
147 0.208939 0.238352
2304 0.201672 0.291258
962 0.310320 0.324235
821 0.429142 0.346017
221 0.413881 0.434675
In [82]:
df1.plot(kind='bar',figsize=(10,8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

Overall we have a decent model, but we also acknowledge that we could do better.

We have a couple of options:

Add new features (avgAnnCount/avgDeathsPerYear/PctWhite etc.) Build a non-linear model

Created by Anand mohan

18bcs6218 Aiml-2